import pandas as pds


def preprocess_data(file_path):
    """Load a shopping-basket sheet and list the product ids in each basket.

    The workbook is read without a header row and interpreted as follows:

    - the first column is discarded;
    - row 0 holds the product ids and row 1 the product names (the names
      are skipped, they are not part of the result);
    - every remaining row is one basket, in which a cell equal to ``"T"``
      marks the product of that column as present.

    Rows that are identical once the first column has been removed are
    collapsed into one, so a repeated basket appears only once in the
    result.

    Args:
        file_path: Passed unchanged to ``pandas.read_excel``.

    Returns:
        A list of lists. Each inner list contains the product ids flagged
        ``"T"`` in one basket, in column order; a basket without any
        ``"T"`` yields an empty list.

    Raises:
        KeyError: If column 0, row 0 or row 1 is not present in the sheet
            (raised by the pandas label lookups / drops below).
    """
    df = pds.read_excel(file_path, header=None)
    # Drop the first column, then remove duplicate rows.
    df_tmp = df.drop([0], axis=1).drop_duplicates()
    # Product ids live in the row labelled 0.  `.loc` + `.values` replace the
    # `.ix` / `.as_matrix()` calls that were removed in pandas 1.0.
    product_ids = df_tmp.loc[0].values
    # Drop the id row and the name row; what is left is one row per basket.
    baskets = df_tmp.drop([0, 1]).values
    # Select ids directly on the "T" flag instead of going through a 0
    # placeholder, so a product whose id is 0 is not filtered out.
    return [
        [product_id for product_id, flag in zip(product_ids, row) if flag == "T"]
        for row in baskets
    ]

